Data analysis

Author
Affiliation
Magnus Johansson
Published

2023-06-12

1 Overview of this file

  • Visualizing distributions
  • Running models
  • Checking assumptions
  • Effects(?)

1.1 Setting up

Read data.

Code
df.long <- read.csv("dataLong.csv")

Let’s load packages/libraries.

Code
# these are mostly for data management/wrangling and visualization
library(tidyverse) # for most things
library(foreign) # for reading SPSS files
library(readxl) # read MS Excel files
library(showtext) # get fonts
library(glue) # simplifies mixing text and code in figures and tables
library(arrow) # support for efficient file formats
library(grateful) # create table+references for packages used in a project
library(styler) # only a one-time installation (it is an Rstudio plugin)
library(car) # for car::recode only
library(skimr) # data skimming
library(lubridate) # for handling dates in data
library(janitor) # for many things in data cleaning

# these are mostly for data analysis and visualization
library(gtsummary)
library(scales)
library(visdat)
library(psych)
library(lme4)
library(nlme)
library(broom.mixed)
library(patchwork)
library(easystats)
library(mice)
library(modelsummary)
library(ggdist)
library(kableExtra)
library(formattable)
library(ggrepel)
library(ggrain)
library(sjPlot)

Define a ggplot theme theme_ki(), a standard table function, kbl_ki(), and a color palette based on KI’s design guide, ki_color_palette.

Code
source("ki.R") # this reads an external file and loads whatever is in it

2 Visualizations

2.1 Distributions

2.1.1 Histogram

Code
df.long %>% 
  filter(Group == "Control") %>% 
  ggplot(aes(x = value, fill = measure)) +
  geom_histogram(binwidth = 4) +
  facet_grid(measure~time) +
  theme_ki() +
  scale_fill_manual(values = ki_color_palette) +
  labs(title = "Outcomes over time",
       subtitle = "Control group",
       x = "",
       y = "Number of respondents")

Code
df.long %>% 
  filter(!Group == "Control") %>% 
  ggplot(aes(x = value, fill = measure)) +
  geom_histogram(binwidth = 4) +
  facet_grid(measure~time) +
  theme_ki() +
  scale_fill_manual(values = ki_color_palette) +
  labs(title = "Outcomes over time",
       subtitle = "Intervention group",
       x = "",
       y = "Number of respondents")

2.1.2 Density

Code
df.long %>% 
  filter(Group == "Control") %>% 
  ggplot(aes(x = value, fill = measure)) +
  geom_density() +
  facet_grid(measure~time) +
  theme_ki() +
  scale_fill_manual(values = ki_color_palette) +
  labs(title = "Outcomes over time",
       subtitle = "Control group",
       x = "",
       y = "Density of respondents")

Code
df.long %>% 
  filter(!Group == "Control") %>% 
  ggplot(aes(x = value, fill = measure)) +
  geom_density() +
  facet_grid(measure~time) +
  theme_ki() +
  scale_fill_manual(values = ki_color_palette) +
  labs(title = "Outcomes over time",
       subtitle = "Intervention group",
       x = "",
       y = "Density of respondents")

What do these distributions look like to you? Normal?

We’ll get back to what this means for our analysis.

2.1.3 Box + violin

Code
df.long %>% 
  ggplot(aes(x = time, y = value, fill = Group)) +
  geom_violin(position = position_dodge(0.9),
              alpha = 0.9) +
  geom_boxplot(position = position_dodge(0.9),
               width = .2,
               notch = TRUE,
               outlier.shape = NA,
               alpha = 0.3) +
  facet_wrap(~measure,
             ncol = 1) +
  theme_ki() +
  labs(title = "Outcomes over time",
       x = "Time point",
       y = "Distribution of outcome measurements")

2.2 Individuals over time

Using the package ggrain we can get this sweet figure that combines a boxplot, a split violin plot, jittered points for individuals, and lines between individuals over time!

Code
df.long %>%
  filter(measure == "DEP") %>% 
  ggplot(aes(x = time, y = value, group = time, fill = Group, color = Group)) +
  geom_rain(
    #alpha = .6,
    boxplot.args = list(color = "black", outlier.shape = NA),
    id.long.var = "id"
  ) +
  scale_fill_brewer(palette = "Dark2",
                    aesthetics = c("color","fill")) +
  theme_ki() +
  facet_wrap(~Group,
             nrow = 2) +
  labs(title = "Depression over time")

Code
df.long %>%
  filter(measure == "DEP") %>% 
  ggplot(aes(x = time, y = value, group = time, fill = Group, color = Group)) +
  geom_rain(
    #alpha = .6,
    boxplot.args = list(color = "black", outlier.shape = NA),
    id.long.var = "id"
  ) +
  scale_fill_brewer(palette = "Dark2",
                    aesthetics = c("color","fill")) +
  theme_ki() +
  facet_wrap(~Group,
             nrow = 2) +
  labs(title = "Depression over time")

3 Linear model 1

We will start with DEP as outcome and fit a simple linear model. But first, we’ll split our measure variable into three separate variables (while retaining time as its own variable).

Code
df.model <- df.long %>% 
  pivot_wider(names_from = "measure",
              values_from = "value") %>% 
  rename(Depression = DEP,
         Anxiety = ANX,
         Stress = STRESS) %>% 
  mutate(time = as.integer(time))
Code
m1 <- lmer(data = df.model,
         Depression ~ time + Group + time*Group + (1 | id),
         REML = TRUE)

3.1 Table

Code
sjPlot::tab_model(m1)
  Depression
Predictors Estimates CI p
(Intercept) 17.01 14.40 – 19.62 <0.001
time -0.93 -1.69 – -0.16 0.017
Group [MiCBT] -1.74 -5.52 – 2.04 0.367
time × Group [MiCBT] -0.92 -2.05 – 0.20 0.108
Random Effects
σ2 36.10
τ00id 70.74
ICC 0.66
N id 106
Observations 368
Marginal R2 / Conditional R2 0.044 / 0.677
Code
tidy(m1)
# A tibble: 6 × 6
  effect   group    term            estimate std.error statistic
  <chr>    <chr>    <chr>              <dbl>     <dbl>     <dbl>
1 fixed    <NA>     (Intercept)       17.0       1.33     12.8  
2 fixed    <NA>     time              -0.927     0.388    -2.39 
3 fixed    <NA>     GroupMiCBT        -1.74      1.92     -0.903
4 fixed    <NA>     time:GroupMiCBT   -0.921     0.572    -1.61 
5 ran_pars id       sd__(Intercept)    8.41     NA        NA    
6 ran_pars Residual sd__Observation    6.01     NA        NA    
Code
glance(m1)
# A tibble: 1 × 7
   nobs sigma logLik   AIC   BIC REMLcrit df.residual
  <int> <dbl>  <dbl> <dbl> <dbl>    <dbl>       <int>
1   368  6.01 -1284. 2581. 2604.    2569.         362

3.2 Predicted response

Code
estimate_expectation(m1, data = "grid") %>% 
  plot() +
  theme_ki()

3.3 Plot parameters

Code
plot(parameters(m1))

3.4 Check assumptions

Code
check_model(m1)

4 Linear model 2

What happens if we define time as a factor?

5 Marginal effects

https://twitter.com/stephenjwild/status/1666056019993034755

link to marginaleffects!